/* The following .do file "NCREIF_EStar_#byBB.do" writen by Becka Brolinson 
will hopefully do the following:

1. Import the NCREIF data from the most recent batch
2. use work from Paige and Jessica Chu to match these data with Energy Star Obs
3. Merge with other data- unemployment, energy prices, gas prices, weather etc. 
	as seen in Jessica and Paige previous work 
	
	*note- this file uses code from both Jessica and Paige's work 
	
	The goal of this .do file is to have a workable data set for a paper for 
	Karen Palmer and Margaret Walls where the RQ: Do energy star certified 
	buildings pay lower utility bills? */
	
clear all
set more 1

*Set Directories 

global NCREIF 	"P:\Efficiency Financing\NCREIF\Summer_2017" 
global OrgData 	"$NCREIF\Original_Data"
global GenData 	"$NCREIF\Generated_Data"
global Logs 	"$NCREIF\Log_Files"
global Datasets "P:\Efficiency Financing\NCREIF\Stata\Datasets"
global Excel 	"P:\Efficiency Financing\NCREIF\Stata\Excel"
global Prices 	"P:\Efficiency Financing\NCREIF\Stata\Energy Prices"


*------------------------------------------------------------------------------*
* 						Step 1- import NCREIF data							   *
*------------------------------------------------------------------------------*
{
	forval i  =1991/2016{
		import excel "$OrgData\NCREIF\\`i'.xlsx", sheet("Query1") firstrow clear
		tempfile NCREIFdata`i'
		save "`NCREIFdata`i''"
		}
	import excel "$OrgData\NCREIF\\before1990.xlsx", sheet("Query1") firstrow clear
	tempfile before1990 
	save "`before1990'"
	
	use "`before1990'", clear
	forval i =1991/2016{
		append using "`NCREIFdata`i''"
		}
		
	save "$GenData\NCREIF_Academic_Data_updatedin2016.dta", replace 
	

	use "$GenData\NCREIF_Academic_Data_updatedin2016.dta", clear

	ren PROP prop
	tostring prop, replace
	ren YYYYQ yyyyq
	tostring yyyyq, replace
	ren Year year
	ren Quarter quarter
	ren Period period
	ren Exp_Util exp_util
	ren PropertyType propertytype
	ren SqFt sqft
	ren NoOfUnits noofunits
	ren NoOfFloors nooffloors
	ren PercentLeased percentleased
	ren Zip zip
	ren MV mv
	ren CBSA cbsa
	ren CBSADiv cbsadiv
	ren State state
	ren City city
	ren County county
	ren CBSAName cbsaname
	ren  YrBuilt yrbuilt
	ren ValuePerSF valuepersf 
	ren ValuePerUnit valueperunit
	ren Inc_BRent inc_brent
	ren MgrID mgrid
	ren  MgrName mgrname
	ren GrossSalePrice grosssaleprice
	ren NetSalePrice netsalepriceweath
	ren SalePrice saleprice
	ren Inc_Total inc_total
	ren Exp_Total exp_total
	
	save "$GenData\NCREIF_Academic_Data_updatedin2016.dta", replace

	foreach var of varlist _all {
	local name =lower("`var'")
	rename `var' `name'
	}

*Labeling the variables their labels from "FieldDefs.pdf"
	label var prop 				"Unique Property Number assigned to each property"
	label var yyyyq 			"Combined year and quarter"
	label var year 				"Reporting year of that specific quarter"
	label var quarter 			"Reporting quarter of that specific quarter"
	label var appret 			"Capital appreciation return"
	label var incret 			"Income return" 
	label var totret 			"Total return (sum of income return and capital return)"
	label var mv 				"End Market Value as of the end of the quarter"
	label var noi 				"Net operating income reported for that property"
	label var capex 			"Total capital expenditures"
	label var psales 			"Partial sales (entire sale price after the freeze)"
	label var denom 			"The denominator used in the NPI formula"
	label var capex_addacqcost 	"Cap Ex Detail � additional acquisition costs"
	label var capex_bldexpan 	"Cap Ex Detail � Building expansion"
	label var capex_bldimp 		"Cap Ex Detail � Building improvements"
	label var capex_leasecomm 	"Cap Ex Detail � Leasing commissions"
	label var capex_other 		"Cap Ex Detail � Other"
	label var capex_ti 			"Cap Ex Detail � Tenant improvements"
	label var capex_tot_calc 	"Cap Ex Detail � Calculated sum of the above detail cap ex"
	label var inc_brent 		"Detail Operating Data � Base rent"
	label var exp_util 			"Detail Operating Data � utilities"
	label var propertytype 		"Property type such as O for Office, R for Retail, etc"
	label var propertysubtype 	"The property subtype, e.g., CB for CBD, R for Regional Mall"
	label var address1 			"Address of specific property"
	label var zip 				"Zip code of the property"
	label var state 			"State where property is located"
	label var city 				"City where the property is located" 
	label var county 			"county where the property is located" 
	label var cbsa 				"The CBSA code (5-digit), e.g., 31100 for Los Angeles, CA"
	label var cbsadiv 			"The Division code (5-digit) for CBSAs that have divisions"
	label var cbsaname 			"The name of the CBSA or division, e.g., GA � Atlanta"
	label var cbsaordiv 		"The CBSA code (5-digit)for CBSAs without divisions but the division code for CBSAs with divisions, e.g., 26420 for Houston, TX"
	label var censusdivfips 	"The Fips code (5-digit) used for Census divisions (see CBSA)"
	label var acqdate 			"Date the property was acquired" 
	label var acquisitionyear 	"Year the property was acquired"
	label var acquisitiondate 	"Date the property was acquired"
	label var initialcost 		"Initial cost of the property was purchased for"
	label var initacqcost 		"Initial cost of the property was purchased for"
	label var percentleased 	"The percent leased (occupancy) ( Between .00 - 1.00)."
	label var saleqtr 			"�True� if the quarter the property sold"
	label var yearsold 			"Year the property was sold."
	label var saleprice 		"Sales price of the property � also the Net Sale Price"
	label var grosssaleprice	"The gross sale price for sold properties"
	label var netsaleprice 		"The net sale price for sold properties"
	label var mgrid 			"Unique ID specific to NCREIF that is assigned to each manager"
	label var mgrname 			"Name of manager"
	label var groundrent 		"Any ground rent paid that quarter for the property"
	label var nooffloors 		"Number of floors associated with the building"
	label var noofbuildings 	"Number of buildings associated with the building"
	label var noofunits 		"Number of units associated with the building"
	label var nra 				"Total net rentable area"
	label var sqft 				"Total square feet of the property"
	label var valuepersf 		"Market Value per Square Foot (calculated when SF available)"
	label var valueperunit 		"Market Value per Unit (calculated when units available, e.g,. for apartments"
	label var yrbuilt 			"The year (YYYY) the property was built"
	label var lastrenovatedyear "The last year (YYYY) the property was renovated"
	label var yrbuiltorlastren 	"The year (YYYY) built or the last renovated year"
	label var inc_total 		"Detail Operating Data � calculated total of the income detail"
	label var exp_total 		"Detail Operating Data � calculated total of the expense detail"
	
*Keep variables possibly relevant for our analysis
	keep 	prop-appret incret inc_total totret mv noi capex psales denom ///
			capex_addacqcost-inc_brent exp_util ///
			propertytype-censusdivfips acqdate-initacqcost value* *total ///
			lifecycle percentleased saleqtr-netsaleprice mgrid mgrname ///
			groundrent-yrbuiltorlastren
	
	save "$GenData\NCREIF_Academic_Data_updatedin2016.dta", replace

*the following code is from NCREIF_with_additional_02262017_forDataUpdate	
	use "$GenData\NCREIF_Academic_Data_updatedin2016.dta", clear
	count
	/*656050*/
	/*only office ad retail*/
	keep if inlist(propertytype, "O", "R")

	drop if cbsa == . /*675 dropped*/


	/*correct one building belongst to different cbsa and zip code*/
	destring zip, replace
	bysort prop: egen zip_mode = mode(zip)
	count if zip_mode ==.  /*0*/


	bysort prop: egen cbsaname_mode = mode(cbsaname)
	bysort prop: egen cbsa_mode = mode(cbsa)
	bysort prop: egen cbsadiv_mode = mode(cbsadiv)

	count if cbsaname_mode == ""
	count if cbsa_mode ==. /*0*/
	count if cbsadiv_mode ==.
		
	order prop cbsa*
	br if cbsaname_mode == "" /*seems ok*/
	drop cbsaname_mode
	
	
drop *mode

*Define Some Variables
destring exp_util, replace
destring sqft, replace
gen utilpersf = exp_util/sqft
gen logutilpersf = log(utilpersf)


gen state2 = substr(trim(cbsaname), 1, 3)
replace state2 = subinstr(state," ","",.)
replace state2 = subinstr(state,"-","",.)
count if state != state2 /*0*/
drop state2


/*some modifications for City*/
/*this new data has better quality on city*/
replace city = proper(city)
replace city = subinstr(city,`"""',"",.)

//create covered_e variable based on sqft and city

gen Covered_E = 1 if city == "Austin" & period >= 140 & sqft >=75000
replace Covered_E = 1 if city == "Austin" & period >= 144 & sqft >=30000
replace Covered_E = 1 if city == "New York" & period >= 138 & sqft >=50000
replace Covered_E = 1 if city  == "San Francisco" & period >= 138 & sqft >=50000
replace Covered_E = 1 if city == "San Francisco" & period >= 140 & sqft >=25000
replace Covered_E = 1 if city == "San Francisco" & period >= 144 & sqft >=10000
replace Covered_E = 1 if city == "Seattle" & period >= 140 & sqft >=50000
replace Covered_E = 1 if city == "Seattle" & period >= 144 & sqft >=20000

//Philly >50k 2014Q1
replace Covered_E = 1 if city == "Philadelphia" & period >= 146 & sqft >= 50000

//2014q3: Austin >10k; Chicago >250k; Minneapolis >100k
replace Covered_E = 1 if city == "Austin" & period >= 148 & sqft >= 10000
replace Covered_E = 1 if city == "Chicago" & period >= 148 & sqft >= 250000
replace Covered_E = 1 if city == "Minneapolis" & period >= 148 & sqft >= 100000

//2014q4: Boston >50k
replace Covered_E = 1 if city == "Boston" & period >= 149 & sqft >= 50000

//2015q2: Minneapolis >50k
replace Covered_E = 1 if city == "Minneapolis" & period >= 151 & sqft >= 50000

//2015q3: Cambridge >50k; Chicago: >50k
replace Covered_E = 1 if city == "Cambridge" & period >= 152 & sqft >= 50000
replace Covered_E = 1 if city == "Chicago" & period >= 152 & sqft >= 50000

//2015q4 Atlanta >50k
replace Covered_E = 1 if city == "Atlanta" & period >= 153 & sqft >= 50000


replace Covered_E = 0 if Covered_E ==.


*Set as Panel
replace period = period + 70
gen propnum = real(prop)

count /*247133*/

xtset propnum period, quarterly
format period %tq


*Drop observations not from population
/*drop observations before 2000*/
drop if period < 160

save "$GenData\NCREIF_Academic_Data_updatedin2016_mod.dta", replace

********************************************************************************
use "$GenData\NCREIF_Academic_Data_updatedin2016_mod.dta", clear


/*only office ad retail*/
tab propertytype
keep if inlist(propertytype, "O", "R")

*Export Zip and CBSA Lists
keep zip
duplicates drop zip, force

save "$GenData\ziplist2016.dta", replace


use "$GenData\NCREIF_Academic_Data_updatedin2016_mod.dta", clear
keep cbsa
duplicates drop cbsa, force

save "$GenData\cbsalist2016.dta", replace
}
*------------------------------------------------------------------------------*
*			Step 2- merge NCREIF Data with other control var data 			   *
*------------------------------------------------------------------------------*
{
**************UNEMPLOYMENT********************************
******Match CBSA Code to CBSA Title
*************************************
*Using the data set that Jessica creates in lines 207-304 in NCREIF_with_additional_01262017_Fordataupdate

use "P:\Efficiency Financing\NCREIF\Stata\Unemployment\Unemployment2016.dta", clear
save "$GenData\Unemployment2016.dta", replace

*Merge with NCREIF Data
use "$GenData\NCREIF_Academic_Data_updatedin2016_mod.dta", replace
merge m:1 cbsa yyyyq using "$GenData\Unemployment2016.dta"

keep if _merge == 3
drop _merge
save "$GenData\NCREIFUnemployment2016.dta", replace

////////////////////////////////////////
/*Elec price*/
////////////////////////////////////
use "$Datasets\ncreif_elec_final_2000to2015.dta", clear
generate quarter = inlist(month, 1, 2, 3)
replace quarter = 2 if inlist(month, 4, 5, 6)
replace quarter = 3 if inlist(month, 7, 8, 9)
replace quarter = 4 if inlist(month, 10, 11, 12)
rename price elecprice
collapse (mean) elecprice, by(zip year quarter)

sum year

sum quarter

gen Zip = real(zip)
drop zip
rename Zip zip

/*NCREIF + Electricity Price*/
merge m:1 zip using "$GenData\ziplist2016.dta"


keep if _merge == 3
drop _merge

/*NCREIF&Unemployment + Electricity Price*/
merge 1:m year quarter zip using "$GenData\NCREIFUnemployment2016.dta"

keep if _merge == 3
drop _merge

save "$GenData\NCREIFElectricity2016.dta", replace

/////////////////////////////////////////////////////////////////////////////////////////////
**********HEATING AND COOLING DEGREE DAYS*************************
use "$Datasets\US_temperature_byZipcode_2000to2015.dta", clear
keep year quarter temperature_K zcta5ce10
/*
from 2000 1st quarter to 2013 4th quarter*/

compress
save "$GenData\Temperatures_2000to2015.dta", replace
//////////////////////////////////////////////

use "$GenData\Temperatures_2000to2015.dta", clear
gen temperature = (9/5)*((temperature_K*.02)-273)+32
gen HDD = (65 - temperature)*8 if temperature < 65 & temperature_K != 0
replace HDD = 0 if HDD == .
gen CDD = (temperature - 65)*8 if temperature > 65 & temperature_K != 0
replace CDD = 0 if CDD == .
gen missingtemp = temperature_K == 0
gen count = 1
rename zcta5ce10 zip
collapse (sum) HDD CDD missingtemp count, by (year quarter zip)
foreach x in HDD CDD {
	replace `x' = `x' * (count/(count-missingtemp))
	}
drop if HDD == . | CDD == .
merge 1:m year zip quarter using "$GenData\NCREIFElectricity2016.dta"
keep if _merge == 3
drop _merge
save "$GenData\NCREIFWeather2016.dta", replace

****************CPI ADJUSTMENT************************************
*Generate CPI Dataset
import excel "$Excel\CPI2016.xls", firstrow clear cellrange(A11:M28)
foreach var of varlist Jan-Dec {
	rename `var' CPI`var'
	}
reshape long CPI, i(Year) j(Month) string
gen quarter = inlist(Month, "Jan", "Feb", "MaRetail")
replace quarter = 2 if inlist(Month, "ApRetail", "May", "Jun")
replace quarter = 3 if inlist(Month, "Jul", "Aug", "Sep")
replace quarter = 4 if inlist(Month, "Oct", "Nov", "Dec")
collapse(mean) CPI, by(Year quarter)
gen yyyyq = 10*Year + quarter
tostring yyyyq, replace
save "$GenData\CPI2016.dta", replace

//GAS price

*********GAS PRICES***************************************************
use "$Prices\ncreif_gas_final_2000to2015.dta", clear
rename com_price_tcf gasprice
merge m:1 zip using "$GenData\ziplist2016.dta"
keep if _merge == 3
drop _merge

save "$GenData\NCREIFGas_2000to2015.dta", replace

///////////////////////////////////////////////////////////////////

*Merge with NCREIF and generate adjusted variables
use "$GenData\NCREIFWeather2016.dta", clear
merge m:1 year zip using "$GenData\NCREIFGas_2000to2015.dta"
/*396 obs missing gas price, keep them in for now*/
drop if _merge==2
drop _merge
merge m:1 yyyyq using "$GenData\CPI2016.dta"
keep if _merge == 3
drop _merge
foreach x in inc_total exp_total exp_util valuepersf valueperunit utilpersf elecprice gasprice inc_brent  {
	gen real_`x' = `x' * (170.1 / CPI)
	}
gen logrealutilpersf = log(real_utilpersf)
save "$GenData\NCREIFAdjusted2016.dta", replace

}
*------------------------------------------------------------------------------*
*		Step 2.5- Before Cleaning- Testing Compiling to Annual Data			   *
*------------------------------------------------------------------------------*
{
/*Here I am testing compiling the annual data before we clean the data 
The purpose of this is to compare the variables of interest if we make the data 
annual  before versus after cleaning */

*In meeting with Margaret Walls in 6/6/2017 We decide that we should 
*use the data before it's been cleaned to aggregate up to the yearly level
use "$GenData\NCREIFAdjusted2016.dta", replace

/*gen new variables*/
destring exp_util, replace
destring exp_total, replace
gen util_pct = exp_util/exp_total *100
gen logrealutilpersf2 = ln(1+real_utilpersf)

*Only looking at office buildings 
	keep if propertytype == "O"
*Merge with estar data and generate the same variables as in step 3 to compare 
*this data set with the post-cleaned data set 
{	
*making a temporary file to merge this data with the estar data 
	tempfile PreCleanAnnualTest
	save `PreCleanAnnualTest' 
	
	merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_new_filledwithEstar.dta"
	gen ever_merged=1 if _merge==3
	drop _merge

	merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_01292015_old_filledwithEstar.dta"
	replace ever_merged=1 if _merge==3
	drop if _merge==2
	drop _merge
	rename rating modify_rating1

	merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_01292015_old_filledwithEstar2.dta"
	replace ever_merged=1 if _merge==3
	drop if _merge==2
	drop _merge
	
*Now, I am only interested in the variables for which we have estar information 
*so I drop the observations that never got estar information 
	drop if ever_merged==.
	
*Create the same variables as in step 3-
count if modify_rating!=. & modify_rating1!=. & rating !=.
replace rating = modify_rating if modify_rating !=. & rating==.
replace rating = modify_rating1 if modify_rating1 !=. & rating==.
drop modify_rating*

sort prop year 

gen ind_yr = year if rating!=.
sum year

*Now, I generate a value for the years since a building was rated 
bysort prop: carryforward ind_yr, gen(last_year_cert) 

gen years_since_cert= year-last_year_cert

*Value of certification has been suggested to decrease over time so, let's gen an indicator for years since cert 
*First, 5 years or less since certification 
gen ind_cert_last5=1 if !missing(years_since_cert) & years_since_cert<=5

*10 years or less since certification 
gen ind_cert_last10=1 if !missing(years_since_cert) & years_since_cert<=10
}

*Follow the same steps as in part 5 

*First, assess number of observations that do not have the full set of quarterly obs
*and generate a measure of which quarters are missing 

*The following stata code is from Paige Gans work (modified by me) from her file 
*NCREIF Energy Star merge.do starting from line 812

//if duplicate quarters 
	duplicates tag propnum year quarter, gen(dup_qtr)
	*no duplicates in the data 
	
//create indicator for first observation of a given propnum
	bys propnum: gen firstobs=1 if _n==1
	
//tag years with values different than 4 quarters of data
	duplicates tag propnum year, gen(noqts)
	gen numqts=noqts+1
	capture drop noqts
		
//record number of nonzero quarters per year
	gen non0=(exp_util!=0)
	bys propnum year: egen non0qts=total(non0)
	*12,117 observations have a 0 
		
//tag buildings with at least one complete year
	bys propnum: egen complete=max(numqts)
	/*
	tab complete

   complete |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |        167        0.17        0.17
          2 |        655        0.67        0.84
          3 |      1,512        1.55        2.39
          4 |     95,214       97.61      100.00
------------+-----------------------------------
      Total |     97,548      100.00

	*/

	//Generate a measure of utility expenditure for each quarter- this will help in
	//future when we take ratios to generate a yearly measure 
	gen qt1=exp_util if quarter==1
	gen qt2=exp_util if quarter==2
	gen qt3=exp_util if quarter==3
	gen qt4=exp_util if quarter==4
	
	*I should ask margaret and Karen why this is the method they decided to pick- 
	*First I some summary statistics of what exactly we are missing 
	bys propnum year: egen qtr1=total(qt1), missing
	bys propnum year: egen qtr2=total(qt2), missing
	bys propnum year: egen qtr3=total(qt3), missing
	bys propnum year: egen qtr4=total(qt4), missing
	
	*Generate a value that tells us which quarter is missing for a particular year-property pair 
	gen miss_qt1=1 if missing(qtr1)
	gen miss_qt2=1 if missing(qtr2)
	gen miss_qt3=1 if missing(qtr3)
	gen miss_qt4=1 if missing(qtr4)
	
	
	
	*Count if nothing is missing 
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 1 
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 2
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 3
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if missing only quarter 4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*Count if  missing both q1&q2
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if  missing both q1&q3
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if  missing both q1&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*count if missing both q2&q3
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*count if missing both q2&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*count if missing both q3&q4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q1,q2&q3
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*Count if missing q1,q2&q4
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*Count if missing q1,q3&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q2,q3,&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==1
	
	*Now, checking out the number of estar buildings we have in the data 
	*No of obs with estar rating 
	count if rating!=.
	*No of Obs. that have ever been rated
	bysort propnum: egen everrated=min(rating)
	count if everrated!=.
	*No of obs. that have been rated in the last 5 or 10 years 
	count if ind_cert_last5==1
	count if ind_cert_last10==1
	*No ob obs that have never been estar rated 
	count if everrated==.
	*No of unique propnum in the data set- ie the number of unique buildings in the data set 
	bysort propnum: gen uniquebuilding=1 if _n==1
	count if uniquebuilding==1
	*Find the number of builings that have ever/never been certified 
	gen buildingevercert=1 if uniquebuilding==1 & everrated!=.
	count if buildingevercert==1
	gen buildingnevercert=1 if uniquebuilding==1 & everrated==.
	count if buildingnevercert==1



*now, I want to see how bad the problem is when we compress the 
	*data into annual observations, I will do the above analysis for the anuual data below 
	
	preserve
	collapse (min) miss_qt1 miss_qt2 miss_qt3 miss_qt4 rating last_year_cert ///
	years_since_cert ind_cert_last5 ind_cert_last10, by(propnum year)
	*Count if nothing is missing 
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 1 
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 2
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 3
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if missing only quarter 4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*Count if  missing both q1&q2
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if  missing both q1&q3
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if  missing both q1&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*count if missing both q2&q3
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*count if missing both q2&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*count if missing both q3&q4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q1,q2&q3
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*Count if missing q1,q2&q4
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*Count if missing q1,q3&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q2,q3,&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==1
	
	*Now, checking out the number of estar buildings we have in the data 
	*No of obs with estar rating 
	count if rating!=.
	*No of Obs. that have ever been rated
	bysort propnum: egen everrated=min(rating)
	count if everrated!=.
	*No of obs. that have been rated in the last 5 or 10 years 
	count if ind_cert_last5==1
	count if ind_cert_last10==1
	*No ob obs that have never been estar rated 
	count if everrated==.
	*No of unique propnum in the data set- ie the number of unique buildings in the data set 
	bysort propnum: gen uniquebuilding=1 if _n==1
	count if uniquebuilding==1
	*Find the number of builings that have ever/never been certified 
	gen buildingevercert=1 if uniquebuilding==1 & everrated!=.
	count if buildingevercert==1
	gen buildingnevercert=1 if uniquebuilding==1 & everrated==.
	count if buildingnevercert==1
	restore
	
	*Figure out if the missing data problem for the the buildings is 
	*when they are joining/leaving the data set 
	*We could probably see this if the first quarter observation 
	*is missing *right* when the building joins the data set 
	by propnum: egen firstyearindata=min(year)
	by propnum: egen lastyearindata=max(year)
	
	*Count the number of observations where the first quarter is missing 
	*In the first year that the data is in the set 
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==. & firstyearindata==year
	*If missing q4 is in the last year the building is in the data set 
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==1 & lastyearindata==year
	
	
	*Now the goal is to generate a measure of real yearly utility expenditure. 

	*Save this data set so that I can make the "fake data" with the missign 
	*quarter observations 
	save "$GenData\NCREIF_wmissingquartersinfo.dta", replace 
	
*First, if all of the quarter observations in the data exist, sum them 
	bys propnum year: egen yr_exp_util=total(real_exp_util) if numqts==4

*Now, if not all of the quarterly observations exist take the ratio of the 
*real utility expenditures, then take the median observation 
*for all of the observations for each propnum over the yeras 
set more off	
forvalues i=1/4 {	
	forvalues j=1/4 {
	gen qt`i'_`j'=qtr`i'/qtr`j' 
	replace qt`i'_`j'=. if qtr`i'<=0 | qtr`j'<=0
	}
	}
	

	//use median ratio to avoid pull of outliers
	forvalues i=1/4 {
		forvalues j=1/4 {
		bysort propnum: egen med`i'_`j'=median(qt`i'_`j') 
		}
		}
*Now, if there is at least 1 complete year for a given building (marked by complete==4)
*then we will be able to use the median values to produce a measure of the missing quarters

	*If 1 quarter is missing:
	*Missing q1 
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3+qtr4*med1_4)/3+qtr2+qtr3+qtr4 ///
				if qtr2!=. & qtr3!=. & qtr4!=. & numqts==3 & complete==4
	*Missing q2
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3+qtr4*med2_4)/3+qtr3+qtr4 ///
				if qtr1!=. & qtr3!=. & qtr4!=. & numqts==3 & complete==4
	*Missing q3 
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2+qtr4*med3_4)/3+qtr4 ///
				if qtr1!=. & qtr2!=. & qtr4!=. & numqts==3 & complete==4
				
	*Missing q4 
	replace 	yr_exp_util=qtr1+qtr2+qtr3+(qtr1*med4_1+qtr2*med4_2+qtr3*med4_3)/3 ///
				if qtr1!=. & qtr2!=. & qtr3!=. & numqts==3 & complete==4
				
	*If 2 quarters are missing 
	*Missing q1&q2
	replace 	yr_exp_util=(qtr3*med1_3+qtr4*med1_4)/2+(qtr3*med2_3+qtr4*med2_4)/2+qtr3+qtr4 ///
				if qtr3!=. & qtr4!=. & numqts==2 & complete==4
	*missing q1&q3
	replace		yr_exp_util=(qtr2*med1_2+qtr4*med1_4)/2+qtr2+(qtr2*med3_2+qtr4*med3_4)/2+qtr4 ///
				if qtr2!=. & qtr4!=. & numqts==2 & complete==4
	*missing q1&q4
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3)/2+qtr2+qtr3+(qtr2*med4_2+qtr3*med4_3)/2 ///
				if qtr2!=. & qtr3!=. & numqts==2 & complete==4			
	*missing q2&q3
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr4*med2_4)/2+(qtr1*med3_1+qtr4*med3_4)/2+qtr4 ///
				if qtr1!=. & qtr4!=. & numqts==2 & complete==4
	*missing q3&q4
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2)/2+(qtr1*med4_1+qtr2*med4_2)/2 ///
				if qtr1!=. & qtr2!=. & numqts==2 & complete==4
	*missing q2&q4
	replace		yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3)/2+qtr3+(qtr1*med4_1+qtr3*med4_3)/2 ///
				if qtr1!=. & qtr3!=. & numqts==2 & complete==4
				
	*If 3 quarters are missing 
	*Have q1
	replace 	yr_exp_util=qt1+qt1*med2_1+qt1*med3_1+qt1*med4_1 ///
				if qt1!=. & numqts==1 & complete==4
	*Have q2
	replace 	yr_exp_util=qt2+qt2*med1_2+qt2*med3_2+qt2*med4_2 ///
				if qt2!=. & numqts==1 & complete==4
	*Have q3
	replace 	yr_exp_util=qt3+qt3*med2_3+qt3*med1_3+qt3*med4_3 ///
				if qt3!=. & numqts==1 & complete==4
	*Have q4
	replace 	yr_exp_util=qt4+qt4*med2_4+qt4*med3_4+qt4*med1_4 ///
				if qt4!=. & numqts==1 & complete==4
				
*now, there are 3431 observations that don't have a yearly observation, 
*this is because the obs never has a fully complete year
*Paige makes the choice to use the median value for a particular state 
*not sure if this is the best idea, but I will use it for now and come 
*back later- because by state isn't taking into account sqft so I 
*think this measure will be pretty off- ask Karen and Margaret 

//state ratio for props without a complete year
		drop med*
			forvalues i=1/4 {
		forvalues j=1/4 {
		bys state: egen med`i'_`j'=median(qt`i'_`j')
		}
		}
				
	*If 1 quarter is missing:
	*Missing q1 
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3+qtr4*med1_4)/3+qtr2+qtr3+qtr4 ///
				if qtr2!=. & qtr3!=. & qtr4!=. & numqts==3 & yr_exp_util==.
	*Missing q2
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3+qtr4*med2_4)/3+qtr3+qtr4 ///
				if qtr1!=. & qtr3!=. & qtr4!=. & numqts==3 & yr_exp_util==.
	*Missing q3 
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2+qtr4*med3_4)/3+qtr4 ///
				if qtr1!=. & qtr2!=. & qtr4!=. & numqts==3 & yr_exp_util==.
				
	*Missing q4 
	replace 	yr_exp_util=qtr1+qtr2+qtr3+(qtr1*med4_1+qtr2*med4_2+qtr3*med4_3)/3 ///
				if qtr1!=. & qtr2!=. & qtr3!=. & numqts==3 & yr_exp_util==.
				
	*If 2 quarters are missing 
	*Missing q1&q2
	replace 	yr_exp_util=(qtr3*med1_3+qtr4*med1_4)/2+(qtr3*med2_3+qtr4*med2_4)/2+qtr3+qtr4 ///
				if qtr3!=. & qtr4!=. & numqts==2 & yr_exp_util==.
	*missing q1&q3
	replace		yr_exp_util=(qtr2*med1_2+qtr4*med1_4)/2+qtr2+(qtr2*med3_2+qtr4*med3_4)/2+qtr4 ///
				if qtr2!=. & qtr4!=. & numqts==2 & yr_exp_util==.
	*missing q1&q4
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3)/2+qtr2+qtr3+(qtr2*med4_2+qtr3*med4_3)/2 ///
				if qtr2!=. & qtr3!=. & numqts==2 & yr_exp_util==.			
	*missing q2&q3
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr4*med2_4)/2+(qtr1*med3_1+qtr4*med3_4)/2+qtr4 ///
				if qtr1!=. & qtr4!=. & numqts==2 & yr_exp_util==.
	*missing q3&q4
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2)/2+(qtr1*med4_1+qtr2*med4_2)/2 ///
				if qtr1!=. & qtr2!=. & numqts==2 & yr_exp_util==.
	*missing q2&q4
	replace		yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3)/2+qtr3+(qtr1*med4_1+qtr3*med4_3)/2 ///
				if qtr1!=. & qtr3!=. & numqts==2 & yr_exp_util==.
				
	*If 3 quarters are missing 
	*Have q1
	replace 	yr_exp_util=qt1+qt1*med2_1+qt1*med3_1+qt1*med4_1 ///
				if qt1!=. & numqts==1 & yr_exp_util==.
	*Have q2
	replace 	yr_exp_util=qt2+qt2*med1_2+qt2*med3_2+qt2*med4_2 ///
				if qt2!=. & numqts==1 & yr_exp_util==.
	*Have q3
	replace 	yr_exp_util=qt3+qt3*med2_3+qt3*med1_3+qt3*med4_3 ///
				if qt3!=. & numqts==1 & yr_exp_util==.
	*Have q4
	replace 	yr_exp_util=qt4+qt4*med2_4+qt4*med3_4+qt4*med1_4 ///
				if qt4!=. & numqts==1 & yr_exp_util==.
				
	*Now, the only observation missing the yearly data is in Charleston, WV
	*This is because this is the only observation for WV 
	
*make the var of interest- yearly real utility expenditure per square foot 
	bysort propnum year: egen yr_exp_util_max=max(yr_exp_util)
	bysort propnum year: egen yr_exp_util_mode=mode(yr_exp_util)
	br if yr_exp_util_mode!=yr_exp_util_max
	by propnum year: egen sqft_max=max(sqft)
	by propnum year: egen sqft_mode=mode(sqft), max
	br if sqft_mode!=sqft_max
	by propnum year: egen sqft_min=min(sqft)
	br if sqft_mode!=sqft_min
	br if sqft_min!=sqft_max
	*If the observations don't have the same square footage in a year, take the mode
	*Check with Margaret and Karen about this 
	replace sqft=sqft_mode 
	*This replaces the information for 3525 observations 
	gen rl_yr_utilpersf=yr_exp_util/sqft
	
	drop if rl_yr_utilpersf==0 | rl_yr_utilpersf==.
	*10509 
	drop if rl_yr_utilpersf>12.55
	drop if rl_yr_utilpersf<0
	
	*collapse (min) rating last_year_cert years_since_cert ind_cert_last5 ///
	*ind_cert_last10 rl_yr_utilpersf, by(propnum year)
*Now, generating some summary statistics on real utility per sqft based on 
*whether buildings are cert or not
	su rl_yr_utilpersf, detail
	su rl_yr_utilpersf if rating!=., detail
	su rl_yr_utilpersf if rating==., detail
	su rl_yr_utilpersf if ind_cert_last5==1, detail
	su rl_yr_utilpersf if ind_cert_last5==., detail
	su rl_yr_utilpersf if ind_cert_last10==1, detail
	su rl_yr_utilpersf if ind_cert_last10==., detail
	*Now I want builiding average information- ie average util per sf
	*if the building has ever been certified versus never been certified 
	*if a building has ever been rated

	su rl_yr_utilpersf if everrated!=.
	su rl_yr_utilpersf if everrated==.
	*Generate a unique building id
	preserve
	drop if uniquebuilding!=1
	su rl_yr_utilpersf if everrated!=.
	su rl_yr_utilpersf if everrated==.
	restore
	
	by propnum: egen firstyearrated=min(last_year_cert)
	*trying to find the mean of the property before and after 
	*certification 
	count if firstyearrated!=.
	su rl_yr_utilpersf if !missing(firstyearrated) & year<firstyearrated, detail
	su rl_yr_utilpersf if !missing(firstyearrated) & year>=firstyearrated, detail
	
		*Making historgrams 
	*ssc describe eqprhistogram
	*ssc install eqprhistogram
	*Full sample histogram 
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Full Sample") bgcolor(white) graphregion(color(white))
	*Histogram if certified in last 5 years
	preserve
	drop if ind_cert_last5==.
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Certified in Last 5 Years") bgcolor(white) graphregion(color(white))
	restore
	
	*Histogram if not certified in last 5 years 
	preserve 
	drop if ind_cert_last5==.
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Not Certified in Last 5 Years") bgcolor(white) graphregion(color(white))
	restore
	
	*Histogram of data before properties have been certified 
	preserve 
	keep if !missing(firstyearrated) & year<firstyearrated
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Before Certification") bgcolor(white) graphregion(color(white))
	restore
	
	*Histogram of data after properties have been certified 
	preserve 
	keep if !missing(firstyearrated) & year>=firstyearrated
	eqprhistogram rl_yr_utilpersf, bin(10)  ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("After Certification") bgcolor(white) graphregion(color(white))
	restore
	
	

	
	
}
*------------------------------------------------------------------------------*
*				Step 3- Clean and Merge with EStar Data 					   *
*------------------------------------------------------------------------------*
{

//CLEAN and get E star information for this new data
use "$GenData\NCREIFAdjusted2016.dta", replace

/*gen new variables*/
destring exp_util, replace
destring exp_total, replace
gen util_pct = exp_util/exp_total *100
gen logrealutilpersf2 = ln(1+real_utilpersf)

**************
/*clean*/
**************
/*1.yrbuilt*/
bysort propnum: egen yrbuilt_max = max(yrbuilt)
bysort propnum: egen yrbuilt_mode = mode(yrbuilt),maxmode
count if yrbuilt_mode ==0 & yrbuilt_max>0 /*some buildings have different year built*/

replace yrbuilt_mode = yrbuilt_max if yrbuilt_max>0 & yrbuilt_mode ==0
rename yrbuilt orig_yrbuilt
rename yrbuilt_mode yrbuilt
count if yrbuilt==. /*8121*/
count if yrbuilt ==0  /*29*/
drop yrbuilt_max


/*2. sqft*/
sum sqft
pctile pct = sqft, nq(100)
list pct in 1/100
capture drop pct
count if sqft <10000
count if sqft > 1000000 | sqft == . 
drop if sqft <10000
drop if sqft > 1000000 | sqft == .

/*3.exp_total and exp_total*/ 
count if exp_total <=0 /*13423*/
drop if exp_total <=0
drop if exp_util >= exp_total

/*4.percentleased*/

sum percentleased
count if percentleased == . 
count if percentleased >1 & percentleased!=.
replace percentleased = 1 if percentleased >1 & percentleased !=.
drop if percentleased == .
drop if percentleased>1

/*5.nooffloors*/
sort propnum nooffloors
bysort propnum: carryforward nooffloors, gen(filled_nooffloors) 
count if nooffloors ==. /*257*/
drop if nooffloors ==.

/*6. real_utilpersf, and util_pct*/
sum real_utilpersf if real_utilpersf>0
sum util_pct if util_pct >0
drop if real_utilpersf ==. | real_utilpersf<=0
pctile pct = real_utilpersf, nq(100)
list pct in 1/100
capture drop pct
drop if real_utilpersf > 3.25
pctile pct = util_pct, nq(100)
list pct in 1/100
capture drop pct
drop if util_pct<0.4973874  |  util_pct > 43.9869


/*7. electricity price*/
drop if elecprice==.


/*drop DC
count if city == "Washington" & state == "DC"
drop if city == "Washington" & state == "DC"*/

//office buildings
keep if propertytype == "O"

tab city if Covered_E ==1

save "$GenData\NCREIFAdjusted2016_CleanedforMerge.dta", replace

/* The following part of this .do file "NCREIF_EStar_#byBB.do" writen by Becka Brolinson 
will hopefully do the following:

1. Merge the estar information from the work of Paige and JC
2. The goal of this is to incorporate the energystar data into the new data set from NCREIF
3. Then, I will check the old data with the new data to see what obs are different 
	and then get the estar information for the obs that are different in the new data set */

use "$GenData\NCREIFAdjusted2016_CleanedforMerge.dta", clear

merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_new_filledwithEstar.dta"
drop _merge

merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_01292015_old_filledwithEstar.dta"
drop if _merge==2
drop _merge
rename rating modify_rating1

merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_01292015_old_filledwithEstar2.dta"
drop if _merge==2
drop _merge

count if modify_rating!=. & modify_rating1!=. & rating !=.
replace rating = modify_rating if modify_rating !=. & rating==.
replace rating = modify_rating1 if modify_rating1 !=. & rating==.
drop modify_rating*

sort prop year 

gen ind_yr = year if rating!=.
sum year

*Install carryforward command
*Now, I generate a value for the years since a building was rated 
bysort prop: carryforward ind_yr, gen(last_year_cert) 

gen years_since_cert= year-last_year_cert

*Value of certification has been suggested to decrease over time so, let's gen an indicator for years since cert 
*First, 5 years or less since certification 
gen ind_cert_last5=1 if !missing(years_since_cert) & years_since_cert<=5

*10 years or less since certification 
gen ind_cert_last10=1 if !missing(years_since_cert) & years_since_cert<=10

save "$GenData\NCREIFAdjusted2016_CleanedwEStar.dta", replace
}
*------------------------------------------------------------------------------*
*			Step 4- check for differences between new & old data 			   *
*------------------------------------------------------------------------------*
{
*Checking which observations are different between the new and the old data set 
*this is a possible problem because of the difficulties of merging in the energy star
*data. So, I want to check which observations are not merging with energy star 
use "$Datasets\Combined_NCREIF_012015.dta", clear
keep if inlist(propertytype, "O", "R")
drop if year<2000
keep year quarter zip prop period yyyyq 
duplicates tag year quarter zip prop period yyyyq, generate(dup)
tempfile UncleanedOld
save "`UncleanedOld'"

use "$GenData\NCREIF_Academic_Data_updatedin2016.dta", clear 
keep if inlist(propertytype, "O", "R")
keep year quarter zip prop period yyyyq 
drop if year<2000 
drop if year>2013
destring zip, replace
duplicates tag year quarter zip prop period yyyyq, generate(dup)
merge 1:1 year quarter zip prop yyyyq using "`UncleanedOld'"

keep if _merge==1
capture drop _merge 
destring prop, gen(propnum)

merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_new_filledwithEstar.dta"
drop if _merge==2
gen ever_merged=1 if _merge==3
drop _merge

merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_01292015_old_filledwithEstar.dta"
drop if _merge==2
replace ever_merged=1 if _merge==3
drop _merge
rename rating modify_rating1

merge m:1 propnum year using "$Datasets\\NCREIFAdjusted_01292015_old_filledwithEstar2.dta"
drop if _merge==2
replace ever_merged=1 if _merge==3
drop _merge

drop if ever_merge==1
duplicates tag year quarter zip prop period yyyyq, generate(duplicate)
save "$GenData\DataDiffOldvsNew.dta", replace

*Now, I want to check how many of these values are in the cleaned data set that 
*we actually care about

use "$GenData\NCREIFAdjusted2016_CleanedwEStar.dta", clear 
merge m:1 year quarter zip prop period yyyyq using "$GenData\DataDiffOldvsNew.dta"

/*None of these observations that do not match with energy star data 
are in the cleaned data set, thus this is not a problem we need to 
worry about, but i want to come back and check this again later after I clean 
the data according to the possibilities of this new project 

    Result                           # of obs.
    -----------------------------------------
    not matched                        81,342
        from master                    81,006  (_merge==1)
        from using                        336  (_merge==2)

    matched                                 0  (_merge==3)
    -----------------------------------------

*/
}
*------------------------------------------------------------------------------*
*				Step 5- Go from Quarterly to Yearly Data					   *
*------------------------------------------------------------------------------*
{
use "$GenData\NCREIFAdjusted2016_CleanedwEStar.dta", clear

/*First, I want to be clear about the fact that we are making the assumption that 
	cleaning the data first and then taking the yearly measure is a choice 
	made by Karen and Margaret as per our meeting on 5/30/17. Their impression is 
	that if there is a "mistake" in an observation that it is not made up 
	in a later quarter. So, we decided that first cleaning the data 
	and then summing/averaging to make the data yearly is the best way to go */
	
	
*First, assess number of observations that do not have the full set of quarterly obs
*and generate a measure of which quarters are missing 

*The following stata code is from Paige Gans work (modified by me) from her file 
*NCREIF Energy Star merge.do starting from line 812

//if duplicate quarters 
	duplicates tag propnum year quarter, gen(dup_qtr)
	*no duplicates in the cleaned data 
	
//create indicator for first observation of a given propnum
	bys propnum: gen firstobs=1 if _n==1
	
//tag years with values different than 4 quarters of data
	duplicates tag propnum year, gen(noqts)
	gen numqts=noqts+1
	capture drop noqts
		
//record number of nonzero quarters per year
	gen non0=(exp_util!=0)
	bys propnum year: egen non0qts=total(non0)
	*All utilities now have a measure of exp_util (this is the result of earlier cleaning) 
		
//tag buildings with at least one complete year
	bys propnum: egen complete=max(numqts)
	/*
	. tab complete

   complete |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |        316        0.39        0.39
          2 |        776        0.96        1.35
          3 |      2,339        2.89        4.24
          4 |     77,575       95.76      100.00
------------+-----------------------------------
      Total |     81,006      100.00
so we can see that most buildings have a year where they have all quarters*/

	//Generate a measure of utility expenditure for each quarter- this will help in
	//future when we take ratios to generate a yearly measure 
	gen qt1=real_exp_util if quarter==1
	gen qt2=real_exp_util if quarter==2
	gen qt3=real_exp_util if quarter==3
	gen qt4=real_exp_util if quarter==4
	
	*I should ask margaret and Karen why this is the method they decided to pick- 
	*First I some summary statistics of what exactly we are missing 
	bys propnum year: egen qtr1=total(qt1), missing
	bys propnum year: egen qtr2=total(qt2), missing
	bys propnum year: egen qtr3=total(qt3), missing
	bys propnum year: egen qtr4=total(qt4), missing
	
	*Generate a value that tells us which quarter is missing for a particular year-property pair 
	gen miss_qt1=1 if missing(qtr1)
	gen miss_qt2=1 if missing(qtr2)
	gen miss_qt3=1 if missing(qtr3)
	gen miss_qt4=1 if missing(qtr4)
	
	*Count if nothing is missing 
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 1 
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 2
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 3
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if missing only quarter 4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*Count if  missing both q1&q2
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if  missing both q1&q3
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if  missing both q1&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*count if missing both q2&q3
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*count if missing both q2&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*count if missing both q3&q4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q1,q2&q3
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*Count if missing q1,q2&q4
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*Count if missing q1,q3&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q2,q3,&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==1
	
	*Now, checking out the number of estar buildings we have in the data 
	*No of obs with estar rating 
	count if rating!=.
	*No of Obs. that have ever been rated
	bysort propnum: egen everrated=min(rating)
	count if everrated!=.
	*No of obs. that have been rated in the last 5 or 10 years 
	count if ind_cert_last5==1
	count if ind_cert_last10==1
	*No ob obs that have never been estar rated 
	count if everrated==.
	*No of unique propnum in the data set- ie the number of unique buildings in the data set 
	bysort propnum: gen uniquebuilding=1 if _n==1
	*Find the number of builings that have ever/never been certified 
	gen buildingevercert=1 if uniquebuilding==1 & everrated!=.
	count if buildingevercert==1
	gen buildingnevercert=1 if uniquebuilding==1 & everrated==.
	count if buildingnevercert==1
	
	{
	*now, I want to see how bad the problem is when we compress the 
	*data into annual observations, I will do the above analysis for the anuual data below 
	
	preserve
	collapse (min) miss_qt1 miss_qt2 miss_qt3 miss_qt4 rating last_year_cert ///
	years_since_cert ind_cert_last5 ind_cert_last10, by(propnum year)
	*Count if nothing is missing 
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 1 
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 2
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if missing only quarter 3
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if missing only quarter 4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*Count if  missing both q1&q2
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==.
	*Count if  missing both q1&q3
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==.
	*Count if  missing both q1&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==. & miss_qt4==1
	*count if missing both q2&q3
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*count if missing both q2&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*count if missing both q3&q4
	count if miss_qt1==. & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q1,q2&q3
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==1 & miss_qt4==.
	*Count if missing q1,q2&q4
	count if miss_qt1==1 & miss_qt2==1 & miss_qt3==. & miss_qt4==1
	*Count if missing q1,q3&q4
	count if miss_qt1==1 & miss_qt2==. & miss_qt3==1 & miss_qt4==1
	*Count if missing q2,q3,&q4
	count if miss_qt1==. & miss_qt2==1 & miss_qt3==1 & miss_qt4==1
	
	*Now, checking out the number of estar buildings we have in the data 
	*No of obs with estar rating 
	count if rating!=.
	*No of Obs. that have ever been rated
	bysort propnum: egen everrated=min(rating)
	count if everrated!=.
	*No of obs. that have been rated in the last 5 or 10 years 
	count if ind_cert_last5==1
	count if ind_cert_last10==1
	*No ob obs that have never been estar rated 
	count if everrated==.
	*No of unique propnum in the data set- ie the number of unique buildings in the data set 
	bysort propnum: gen uniquebuilding=1 if _n==1
	count if uniquebuilding==1
	*Find the number of builings that have ever/never been certified 
	gen buildingevercert=1 if uniquebuilding==1 & everrated!=.
	count if buildingevercert==1
	gen buildingnevercert=1 if uniquebuilding==1 & everrated==.
	count if buildingnevercert==1
	restore
	}


*Now the goal is to generate a measure of real yearly utility expenditure. 

*First, if all of the quarter observations in the data exist, sum them 
	bys propnum year: egen yr_exp_util=total(real_exp_util) if numqts==4
	
save "$GenData\NCREIF_wmissingquartersinfo.dta", replace 
/*Now, if not all of the quarterly observations exist take the ratio of the 
*real utility expenditures, then take the median observation 
for all of the observations for each propnum over the years*/
set more off	
forvalues i=1/4 {	
	forvalues j=1/4 {
	gen qt`i'_`j'=qtr`i'/qtr`j' 
	replace qt`i'_`j'=. if qtr`i'<=0 | qtr`j'<=0
	}
	}
	

	//use median ratio to avoid pull of outliers
	forvalues i=1/4 {
		forvalues j=1/4 {
		bysort propnum: egen med`i'_`j'=median(qt`i'_`j') 
		}
		}
*Now, if there is at least 1 complete year for a given building (marked by complete==4)
*then we will be able to use the median values to produce a measure of the missing quarters

	*If 1 quarter is missing:
	*Missing q1 
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3+qtr4*med1_4)/3+qtr2+qtr3+qtr4 ///
				if qtr2!=. & qtr3!=. & qtr4!=. & numqts==3 & complete==4
	*Missing q2
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3+qtr4*med2_4)/3+qtr3+qtr4 ///
				if qtr1!=. & qtr3!=. & qtr4!=. & numqts==3 & complete==4
	*Missing q3 
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2+qtr4*med3_4)/3+qtr4 ///
				if qtr1!=. & qtr2!=. & qtr4!=. & numqts==3 & complete==4
				
	*Missing q4 
	replace 	yr_exp_util=qtr1+qtr2+qtr3+(qtr1*med4_1+qtr2*med4_2+qtr3*med4_3)/3 ///
				if qtr1!=. & qtr2!=. & qtr3!=. & numqts==3 & complete==4
				
	*If 2 quarters are missing 
	*Missing q1&q2
	replace 	yr_exp_util=(qtr3*med1_3+qtr4*med1_4)/2+(qtr3*med2_3+qtr4*med2_4)/2+qtr3+qtr4 ///
				if qtr3!=. & qtr4!=. & numqts==2 & complete==4
	*missing q1&q3
	replace		yr_exp_util=(qtr2*med1_2+qtr4*med1_4)/2+qtr2+(qtr2*med3_2+qtr4*med3_4)/2+qtr4 ///
				if qtr2!=. & qtr4!=. & numqts==2 & complete==4
	*missing q1&q4
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3)/2+qtr2+qtr3+(qtr2*med4_2+qtr3*med4_3)/2 ///
				if qtr2!=. & qtr3!=. & numqts==2 & complete==4			
	*missing q2&q3
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr4*med2_4)/2+(qtr1*med3_1+qtr4*med3_4)/2+qtr4 ///
				if qtr1!=. & qtr4!=. & numqts==2 & complete==4
	*missing q3&q4
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2)/2+(qtr1*med4_1+qtr2*med4_2)/2 ///
				if qtr1!=. & qtr2!=. & numqts==2 & complete==4
	*missing q2&q4
	replace		yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3)/2+qtr3+(qtr1*med4_1+qtr3*med4_3)/2 ///
				if qtr1!=. & qtr3!=. & numqts==2 & complete==4
				
	*If 3 quarters are missing 
	*Have q1
	replace 	yr_exp_util=qt1+qt1*med2_1+qt1*med3_1+qt1*med4_1 ///
				if qt1!=. & numqts==1 & complete==4
	*Have q2
	replace 	yr_exp_util=qt2+qt2*med1_2+qt2*med3_2+qt2*med4_2 ///
				if qt2!=. & numqts==1 & complete==4
	*Have q3
	replace 	yr_exp_util=qt3+qt3*med2_3+qt3*med1_3+qt3*med4_3 ///
				if qt3!=. & numqts==1 & complete==4
	*Have q4
	replace 	yr_exp_util=qt4+qt4*med2_4+qt4*med3_4+qt4*med1_4 ///
				if qt4!=. & numqts==1 & complete==4
				
*now, there are 3431 observations that don't have a yearly observation, 
*this is because the obs never has a fully complete year
*Paige makes the choice to use the median value for a particular state 
*not sure if this is the best idea, but I will use it for now and come 
*back later- because by state isn't taking into account sqft so I 
*think this measure will be pretty off- ask Karen and Margaret 

//state ratio for props without a complete year
		drop med*
			forvalues i=1/4 {
		forvalues j=1/4 {
		bys state year: egen med`i'_`j'=median(qt`i'_`j')
		}
		}
				
	*If 1 quarter is missing:
	*Missing q1 
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3+qtr4*med1_4)/3+qtr2+qtr3+qtr4 ///
				if qtr2!=. & qtr3!=. & qtr4!=. & numqts==3 & yr_exp_util==.
	*Missing q2
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3+qtr4*med2_4)/3+qtr3+qtr4 ///
				if qtr1!=. & qtr3!=. & qtr4!=. & numqts==3 & yr_exp_util==.
	*Missing q3 
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2+qtr4*med3_4)/3+qtr4 ///
				if qtr1!=. & qtr2!=. & qtr4!=. & numqts==3 & yr_exp_util==.
				
	*Missing q4 
	replace 	yr_exp_util=qtr1+qtr2+qtr3+(qtr1*med4_1+qtr2*med4_2+qtr3*med4_3)/3 ///
				if qtr1!=. & qtr2!=. & qtr3!=. & numqts==3 & yr_exp_util==.
				
	*If 2 quarters are missing 
	*Missing q1&q2
	replace 	yr_exp_util=(qtr3*med1_3+qtr4*med1_4)/2+(qtr3*med2_3+qtr4*med2_4)/2+qtr3+qtr4 ///
				if qtr3!=. & qtr4!=. & numqts==2 & yr_exp_util==.
	*missing q1&q3
	replace		yr_exp_util=(qtr2*med1_2+qtr4*med1_4)/2+qtr2+(qtr2*med3_2+qtr4*med3_4)/2+qtr4 ///
				if qtr2!=. & qtr4!=. & numqts==2 & yr_exp_util==.
	*missing q1&q4
	replace 	yr_exp_util=(qtr2*med1_2+qtr3*med1_3)/2+qtr2+qtr3+(qtr2*med4_2+qtr3*med4_3)/2 ///
				if qtr2!=. & qtr3!=. & numqts==2 & yr_exp_util==.			
	*missing q2&q3
	replace 	yr_exp_util=qtr1+(qtr1*med2_1+qtr4*med2_4)/2+(qtr1*med3_1+qtr4*med3_4)/2+qtr4 ///
				if qtr1!=. & qtr4!=. & numqts==2 & yr_exp_util==.
	*missing q3&q4
	replace 	yr_exp_util=qtr1+qtr2+(qtr1*med3_1+qtr2*med3_2)/2+(qtr1*med4_1+qtr2*med4_2)/2 ///
				if qtr1!=. & qtr2!=. & numqts==2 & yr_exp_util==.
	*missing q2&q4
	replace		yr_exp_util=qtr1+(qtr1*med2_1+qtr3*med2_3)/2+qtr3+(qtr1*med4_1+qtr3*med4_3)/2 ///
				if qtr1!=. & qtr3!=. & numqts==2 & yr_exp_util==.
				
	*If 3 quarters are missing 
	*Have q1
	replace 	yr_exp_util=qt1+qt1*med2_1+qt1*med3_1+qt1*med4_1 ///
				if qt1!=. & numqts==1 & yr_exp_util==.
	*Have q2
	replace 	yr_exp_util=qt2+qt2*med1_2+qt2*med3_2+qt2*med4_2 ///
				if qt2!=. & numqts==1 & yr_exp_util==.
	*Have q3
	replace 	yr_exp_util=qt3+qt3*med2_3+qt3*med1_3+qt3*med4_3 ///
				if qt3!=. & numqts==1 & yr_exp_util==.
	*Have q4
	replace 	yr_exp_util=qt4+qt4*med2_4+qt4*med3_4+qt4*med1_4 ///
				if qt4!=. & numqts==1 & yr_exp_util==.
				
	*Now, the only observation missing the yearly data is in Charleston, WV
	*This is because this is the only observation for WV 
	
*make the var of interest- yearly real utility expenditure per square foot 
	bysort propnum year: egen yr_exp_util_max=max(yr_exp_util)
	bysort propnum year: egen yr_exp_util_mode=mode(yr_exp_util)
	br if yr_exp_util_mode!=yr_exp_util_max
	by propnum year: egen sqft_max=max(sqft)
	by propnum year: egen sqft_mode=mode(sqft), max
	br if sqft_mode!=sqft_max
	by propnum year: egen sqft_min=min(sqft)
	br if sqft_mode!=sqft_min
	br if sqft_min!=sqft_max
	*If the observations don't have the same square footage in a year, take the mode
	*Check with Margaret and Karen about this 
	replace sqft=sqft_mode 
	*This replaces the information for 3525 observations 
	gen rl_yr_utilpersf=yr_exp_util/sqft
	
	*there are only 5 observation that fall aboove the 12.50 cut-off that is 
	*described in the paper by Karen and Margaret, and of these, they all either 
	*only have 1 or 2 quarters of data, but I am still not convinced that the 
	*state method is actually a good way to project the data when the building 
	*doesn't have at least one clean year- absolutely want to revist this 
	**Tomorrow, i want to make more summary statistics on the real util expenditures when 
	*looking at whether the buildings that are cert really do have a lower utility exp
	
*Collapse the data into annual observations 
*Currently, not worrying about any of the control variables- I can figure out 
*how we should collapse these once we decide whether we want to collapse the data before or 
*after we clean the NCREIF data 

*Check to make sure real util per sf is the same for all obs in a year 
	bysort propnum year: egen utilpersf_mode=mode(rl_yr_utilpersf), max
	bysort propnum year: egen utilpersf_max=max(rl_yr_utilpersf)
	count if utilpersf_max!=utilpersf_mode
	*no observations 
	*capture drop sqft_mode sqft_min sqft_max qt* med* miss* qtr* complete ///
		*non0 non0qts firstobs dup_qtr lasy_year_cert 
	*collapse (min) rating last_year_cert years_since_cert ind_cert_last5 ///
	*ind_cert_last10 rl_yr_utilpersf, by(propnum year)
	
	drop if rl_yr_utilpersf>12.50

*Now, generating some summary statistics on real utility per sqft based on 
*whether buildings are cert or not
	su rl_yr_utilpersf, detail
	su rl_yr_utilpersf if rating!=., detail
	su rl_yr_utilpersf if rating==., detail
	su rl_yr_utilpersf if ind_cert_last5==1, detail
	su rl_yr_utilpersf if ind_cert_last5==., detail
	su rl_yr_utilpersf if ind_cert_last10==1, detail
	su rl_yr_utilpersf if ind_cert_last10==., detail
	*Now I want builiding average information- ie average util per sf
	*if the building has ever been certified versus never been certified 
	*if a building has ever been rated
	su rl_yr_utilpersf if everrated!=.
	su rl_yr_utilpersf if everrated==.
	*Generate a unique building id
	preserve
	by propnum: egen building_avg=mean(rl_yr_utilpersf)
	drop if uniquebuilding!=1
	su rl_yr_utilpersf if everrated!=.
	su rl_yr_utilpersf if everrated==.
	restore
	
	by propnum: egen firstyearrated=min(last_year_cert)
	*trying to find the mean of the property before and after 
	*certification 
	count if firstyearrated!=.
	su rl_yr_utilpersf if !missing(firstyearrated) & year<firstyearrated, detail
	su rl_yr_utilpersf if !missing(firstyearrated) & year>=firstyearrated, detail
	
		*Making historgrams 
	*ssc describe eqprhistogram
	*ssc install eqprhistogram
	*Full sample histogram 
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Full Sample") bgcolor(white) graphregion(color(white))
	*Histogram if certified in last 5 years
	preserve
	drop if ind_cert_last5==.
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Certified in Last 5 Years") bgcolor(white) graphregion(color(white))
	restore
	
	*Histogram if not certified in last 5 years 
	preserve 
	drop if ind_cert_last5==.
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Not Certified in Last 5 Years") bgcolor(white) graphregion(color(white))
	restore
	
	
	*Histogram of data before properties have been certified 
	preserve 
	keep if !missing(firstyearrated) & year<firstyearrated
	eqprhistogram rl_yr_utilpersf, bin(10) ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("Before Certification") bgcolor(white) graphregion(color(white))
	restore
	
	*Histogram of data after properties have been certified 
	preserve 
	keep if !missing(firstyearrated) & year>=firstyearrated
	eqprhistogram rl_yr_utilpersf, bin(10)  ///
	plot(kdensity rl_yr_utilpersf, biweight width(5) lcolor(navy) ///
	lwidth(medthick)) title("After Certification") bgcolor(white) graphregion(color(white))
	restore
	
}
